# Cleanup APNs for merge from DINS to historical assessment data
pacman::p_load(stringdist, data.table, fst)

rm(list = ls())

source("code/globals.R")

dins <- readRDS(file.path(WORKING, "dins_records.Rds"))

dins <- data.table(dins)

# 2013 and 2014 incidents
dins13 <- dins[dins$date < as.Date("2015-01-01"), ]
# 2015 and early 2016 incidents
dins15 <- dins[dins$date >= as.Date("2015-01-01") &
  dins$date < as.Date("2016-09-01"), ]

attr13 <- read_fst(file.path(WORKING, "ztraxdata", "CAattr2013.fst"), as.data.table = TRUE)
attr15 <- read_fst(file.path(WORKING, "ztraxdata", "CAattr2015.fst"), as.data.table = TRUE)

## Confirm unique IDs in all datasets
anyDuplicated(dins13[, c("FIPS", "UnformattedAssessorParcelNumber")])
anyDuplicated(dins15[, c("FIPS", "UnformattedAssessorParcelNumber")])
anyDuplicated(attr13[, c("ImportParcelID", "BuildingOrImprovementNumber")])
anyDuplicated(attr15[, c("ImportParcelID", "BuildingOrImprovementNumber")])


## Function to inspect IDs by county in each dataset. Looks for approx string matches on address; then use that to compare APNs
inspectAPN <- function(targetfips, dinsfile, attrfile) {
  dins <- dinsfile
  attr <- attrfile
  dd <- dins[dins$FIPS == targetfips, c(
    "APN", "UnformattedAssessorParcelNumber", "STREETNUMBER",
    "STREETNAME", "STREETTYPE", "STREETSUFFIX", "CITY", "STRUCTURETYPE",
    "howmanystructures", "badrecordflag"
  )]
  dd <- dd[order(dd$STREETNAME, dd$STREETNUMBER), ]
  dd$address <- tolower(paste(dd$STREETNUMBER, dd$STREETNAME, dd$STREETTYPE, sep = " "))
  aa <- attr[attr$FIPS == targetfips, c(
    "BuildingOrImprovementNumber",
    "UnformattedAssessorParcelNumber", "PropertyFullStreetAddress",
    "NoOfUnits", "PropertyLandUseStndCode", "PropertyStreetName", "PropertyHouseNumber", "PropertyStreetSuffix"
  )]
  aa <- aa[order(aa$PropertyStreetName, aa$PropertyHouseNumber), ]
  aa$address <- tolower(paste(aa$PropertyHouseNumber, aa$PropertyStreetName, aa$PropertyStreetSuffix))
  names(dd)[names(dd) == "UnformattedAssessorParcelNumber"] <- "UnformattedAPN_DINS"
  names(dd)[names(dd) == "APN"] <- "APN_DINS"
  names(dd)[names(dd) == "address"] <- "address_DINS"
  ## For 100 random fire records, find closest address string in ZTrax
  set.seed(16473)
  dd$rand <- runif(nrow(dd))
  dd <- dd[order(dd$rand), ]
  if (nrow(dd) > 100) {
    dd <- dd[1:100, ]
  }
  table <- data.frame(matrix(nrow = nrow(dd), ncol = 10, data = NA))
  ## Going house by house in the DINS sample, subset Ztrax to only records with identical house NUMBER, then find closest string for full address.
  for (i in seq(1, nrow(dd))) {
    exacthousenumbermatches <- aa[aa$PropertyHouseNumber == dd[i, STREETNUMBER]]
    bestmatch <- amatch(dd[i, address_DINS], exacthousenumbermatches$address, maxDist = 100, method = "dl")
    tablerow <- cbind(
      dd[i, c("UnformattedAPN_DINS", "APN_DINS", "address_DINS", "STRUCTURETYPE", "howmanystructures", "badrecordflag")],
      exacthousenumbermatches[bestmatch, c("PropertyFullStreetAddress", "UnformattedAssessorParcelNumber")]
    )
    tablerow$stringdist <- stringdist(tolower(tablerow$address_DINS), tolower(tablerow$PropertyFullStreetAddress), method = "dl")
    table[i, ] <- tablerow
  }
  names(table) <- names(tablerow)
  table <- table[, c(
    "stringdist", "STRUCTURETYPE", "UnformattedAssessorParcelNumber", "UnformattedAPN_DINS",
    "PropertyFullStreetAddress",
    "address_DINS", "howmanystructures", "badrecordflag"
  )]
  table <- table[order(table$stringdist), ]
  table$lengthUAPN_DINS <- nchar(table$UnformattedAPN_DINS)
  table$lengthUAPN_Ztrax <- nchar(table$UnformattedAssessorParcelNumber)
  return(table)
}

## Function to check how many string address matches are perfectly match on UAPN
checkAPN <- function(targetfips, dinsfile, attrfile) {
  v <- inspectAPN(targetfips, dinsfile, attrfile)
  v <- v[v$stringdist <= 10 & !is.na(v$stringdist), ]
  print(nrow(v[v$UnformattedAssessorParcelNumber == v$UnformattedAPN_DINS, ]) / nrow(v))
  v <- v[v$stringdist <= 8 & !is.na(v$stringdist), ]
  print(nrow(v[v$UnformattedAssessorParcelNumber == v$UnformattedAPN_DINS, ]) / nrow(v))
}


#### Some basic operations for reformatting APN numbers.
# 1. Replace dashes.
nodash <- function(dinsfile, fips) {
  dinsfile$UnformattedAssessorParcelNumber[dinsfile$FIPS == fips] <-
    gsub("-", "", dinsfile$UnformattedAssessorParcelNumber, fixed = TRUE)[dinsfile$FIPS == fips]
  return(dinsfile)
}

# 2. Substring
usesubset <- function(dinsfile, fips, start, end) {
  dinsfile$UnformattedAssessorParcelNumber[dinsfile$FIPS == fips] <-
    substr(dinsfile$UnformattedAssessorParcelNumber, start, end)[dinsfile$FIPS == fips]
  return(dinsfile)
}

# 3. Pre-append a zero
pre0 <- function(dinsfile, fips) {
  dinsfile$UnformattedAssessorParcelNumber[dinsfile$FIPS == fips] <-
    paste0("0", dinsfile$UnformattedAssessorParcelNumber[dinsfile$FIPS == fips])
  return(dinsfile)
}

# 4. Post-append a zero
post0 <- function(dinsfile, fips) {
  dinsfile$UnformattedAssessorParcelNumber[dinsfile$FIPS == fips] <-
    paste0(dinsfile$UnformattedAssessorParcelNumber[dinsfile$FIPS == fips], "0")
  return(dinsfile)
}


### 2013 and 2014 INCIDENTS IN DINS ###########

b <- dins13[, .N, by = FIPS]
b <- b[order(as.numeric(b$FIPS)), ]
b$FIPS
rm(b)

# 6045 -- No changes needed
inspectAPN(6045, dins13, attr13)
checkAPN(6045, dins13, attr13)

# 6065 -- No changes needed
inspectAPN(6065, dins13, attr13)
checkAPN(6065, dins13, attr13)

# 6073 -- No changes needed
inspectAPN(6073, dins13, attr13)
checkAPN(6073, dins13, attr13)

# 6089
inspectAPN(6089, dins13, attr13)
## Drop trailing 3 zeros
dins13 <- usesubset(dins13, 6089, 1, 9)
## Strip the zero in position 6
stopifnot(substr(dins13$UnformattedAssessorParcelNumber[dins13$FIPS == 6089], 6, 6) == 0)
stopifnot(nchar(dins13$UnformattedAssessorParcelNumber[dins13$FIPS == 6089]) == 9)
dins13$UnformattedAssessorParcelNumber[dins13$FIPS == 6089] <-
  paste0(
    substr(dins13$UnformattedAssessorParcelNumber[dins13$FIPS == 6089], 1, 5),
    substr(dins13$UnformattedAssessorParcelNumber[dins13$FIPS == 6089], 7, 9)
  )
## Confirm none of this loses information in DINS (APN still identifies unique property?)
anyDuplicated(dins13$UnformattedAssessorParcelNumber[dins13$FIPS == 6089])
checkAPN(6089, dins13, attr13)

# 6093 ### - Only 1 destroyed structure (plus 1 minor damage), and no APN info in DINS. Drop.
inspectAPN(6093, dins13, attr13)
dins13 <- dins13[!dins13$FIPS == 6093, ]

### 2015 AND EARLY 2016 INCIDENTS CLEANUP

b <- dins15[, .N, by = FIPS]
b <- b[order(as.numeric(b$FIPS)), ]
b$FIPS
rm(b)

# 6009 -
inspectAPN(6009, dins15, attr15)
table(nchar(dins15$UnformattedAssessorParcelNumber[dins15$FIPS == 6009]), useNA = "always")
# Pre-append a zero to DINS
dins15 <- pre0(dins15, 6009)
checkAPN(6009, dins15, attr15)

# 6019 - appears to be erroenous append of 's' onto one of the APNS in DINS - otherwise no changes needed
## -- I looked at all APNs in ZTRAX for this county file and none of them end in 's' or 'S'.
inspectAPN(6019, dins15, attr15)
dins15$UnformattedAssessorParcelNumber[dins15$FIPS == 6019 &
  dins15$UnformattedAssessorParcelNumber == "12811126S"] <- "12811126"
checkAPN(6019, dins15, attr15)

# 6029 - no changes required
inspectAPN(6029, dins15, attr15)
checkAPN(6029, dins15, attr15)

# 6033
inspectAPN(6033, dins15, attr15)
inspectAPN(6033, dins15[dins15$INCIDENTNAME != "Valley", ], attr15)
inspectAPN(6033, dins15[dins15$INCIDENTNAME == "Valley", ], attr15)
## remove dashes
dins15 <- nodash(dins15, 6033)

# ZILLOW-SIDE CHANGE: STRIP TRAILING TWO DIGITS IF LENGTH IS TOO LONG - some have "01" or "02" appended - most but not all are mobile homes.
# I strip these digits for properties EXCEPT for mobile homes (PropertyLandUseStndCode RR103).
table(nchar(attr15$UnformattedAssessorParcelNumber[attr15$FIPS == 6033]))
table(nchar(dins15$UnformattedAssessorParcelNumber[dins15$FIPS == 6033]))
y <- which(attr15$FIPS == 6033 & nchar(attr15$UnformattedAssessorParcelNumber) %in% c(9, 10) & attr15$PropertyLandUseStndCode != "RR103")
attr15$UnformattedAssessorParcelNumber[y] <- substr(attr15$UnformattedAssessorParcelNumber[y], 1, 8)
rm(y)
# Do APNs in ZTRAX still identify properties for the subset that are in DINS?
#- No - this truncation creates a small number (6) of perfect duplicate APNS on the Zillow side. They appear to be true duplicates, based on street address.
#- The older (2013) records are more complete for year built than the newer (2014) records, so I keep the older ones.
#- UPDATE: I remove the duplicate UAPNs whether they are in DINS or not. Necessary to match to parcel shapefile.
#- First, identify the duplicated rows to be removed
v <- which(attr15$FIPS == 6033 &
  # attr15$UnformattedAssessorParcelNumber %in% dins15$UnformattedAssessorParcelNumber[dins15$FIPS==6033] &
  (duplicated(attr15[, c("FIPS", "UnformattedAssessorParcelNumber")]) | duplicated(attr15[, c("FIPS", "UnformattedAssessorParcelNumber")], fromLast = TRUE)))
z <- attr15[v, .(RowID, UnformattedAssessorParcelNumber, PropertyLandUseStndCode, PropertyFullStreetAddress, ExtractDate, Edition, date, YearBuilt)]
z <- z[order(UnformattedAssessorParcelNumber, date), ]
z[, copy := seq_len(.N), by = UnformattedAssessorParcelNumber]
z[, numcopies := .N, by = UnformattedAssessorParcelNumber]
z <- z[copy != 1, ] # Keep EARLIEST because the 2013 are more complete for year built than the 2014.
#- Then, strip those rows out of the main dataset.
attr15 <- attr15[!attr15$RowID %in% z$RowID, ]
rm(z)

checkAPN(6033, dins15, attr15)
checkAPN(6033, dins15[dins15$INCIDENTNAME == "Valley", ], attr15)

# 6051
inspectAPN(6051, dins15, attr15)
checkAPN(6051, dins15, attr15)

# 6053
inspectAPN(6053, dins15, attr15)
# Trim 3 trailing zeros from DINS
stopifnot(nchar(dins15$UnformattedAssessorParcelNumber[dins15$FIPS == 6053]) == 12)
stopifnot(substr(dins15$UnformattedAssessorParcelNumber[dins15$FIPS == 6053], 10, 12) == "000")
dins15 <- usesubset(dins15, 6053, 1, 9)
checkAPN(6053, dins15, attr15)

# 6055
inspectAPN(6055, dins15, attr15)
dins15 <- usesubset(dins15, 6055, 1, 9)
checkAPN(6055, dins15, attr15)

# 6079
inspectAPN(6079, dins15, attr15)
dins15 <- nodash(dins15, 6079)
checkAPN(6079, dins15, attr15)

# 6093
inspectAPN(6093, dins15, attr15)
dins15 <- nodash(dins15, 6093)
checkAPN(6093, dins15, attr15)

# 6097 - Only 1 home in DINS, missing street number information (0 given for house number). Drop.
inspectAPN(6097, dins15, attr15)
dins15 <- dins15[!dins15$FIPS == 6097, ]

write_fst(dins13, file.path(WORKING, "dins13_cleaned.fst"))
write_fst(dins15, file.path(WORKING, "dins15_cleaned.fst"))
write_fst(attr13, file.path(WORKING, "attr13_cleaned.fst"))
write_fst(attr15, file.path(WORKING, "attr15_cleaned.fst"))
